rm(list=ls())
setwd("E:/data/cooperation/poor/投稿gini/data")

library(foreign)
library(xlsxjars)
library(xlsx)
library(readstata13)
library(stringr)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(randomForest)
library(e1071)
library(party)
##data mining----
da1<-read.dta13("data_mining.dta")
name<-colnames(da1)
a<-c(which(name=="year"),which(name=="id"))
da<-da1[,-a]
name<-colnames(da)
n<-dim(da)[2]
for(i in 1:dim(da)[2])
{
  da[,i]<-as.numeric(da[,i])
}
a<-vector(length = n)
for (i in 1:n)
{
  a[i]<-sum(is.na(da[,i]))/dim(da)[1]
}
re<-data.frame(name=name,missing=a)
miss_variable<-re[,2]
da_new<-da[,which(re$missing<0.25)]
##main data----
da_new<-data.frame(pov9=da$pov9,da_new)
name<-colnames(da_new)
##corelation ------
a<-1
for (i in 2:length(name))
{
  a<-cbind(a,cor.test(da_new$pov9,da_new[,i])$estimate)
}
a<-a[-1]
a<-data.frame(cor=a)
a<-abs(a)
##country identi----
da<-data.frame(id=da1$id,year=da1$year,da_new)
names(table(da$id))#202country
da2<-split(da,da$id)  #split by country
country<-labels(da2)#country name
re<-colSums(is.na(da2[[1]]))/dim(da2[[1]])[1]
for (i in 2:length(country))
{
  re1<-colSums(is.na(da2[[i]]))/dim(da2[[i]])[1]
  re<-cbind(re,re1)
}
colnames(re)<-country
write.csv(re,file="balance_country.csv")#the missing status of country 
re_country<-re
ba_country<-colMeans(re_country)
sum<-summary(ba_country)
ba<-data.frame(missing=ba_country)
##fig A1-----
miss_variable<-data.frame(prob=miss_variable,label="Variable level") ##missing probability of variables
a<-data.frame(prob=a$cor,label="Correlative coefficient")  ##corelation between variables and gini index 
ba<-data.frame(prob=ba$missing,label="National level") ##missing probability of country level
da_A1<-rbind(miss_variable,a,ba)
ggplot(da_A1,aes(x=label))+  ##boxplot
  geom_boxplot(aes(y=prob))+
  geom_jitter(aes(y=prob),size=0.8)+
  theme_bw()+
  labs(y="Probability",size=0.1,x="")+
  theme(axis.text.x = element_text(family="Arial", size=8))+
  theme(title=element_text(size=8),legend.position="none")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
country_name<-names(which(ba_country<=sum[5]))
write.csv(country_name,file="country.csv")
da_1<-da[which(da$id==country_name[1]),]
for (i in 2:length(country_name))
{
  da_1<-rbind(da_1,da[which(da$id==country_name[i]),])
}
da<-da_1  ###this is the main database 
name<-colnames(da)
write.dta(da,file="da_original.dta")
da_rolling<-da
a<-c(which(name=="year"),which(name=="id"))
da_new<-da[,-a]
num_test<-which(is.na(da_new$pov9))
num_train<-(1:3775)[-num_test]
num_test_original<-num_test
num_train_original<-num_train
da_test<-da_new[which(is.na(da_new$pov9)),]
da_train<-da_new[-which(is.na(da_new$pov9)),]

##predict----
set.seed(100000)

##compare the gini index ----
da<-data.frame(da,decision=1)
da1<-da
da1$decision<-"Total data"
da$decision[num_test]<-"Testing data"
da$decision[num_train]<-"Training data"
da_plot<-rbind(da,da1)
ggplot(da_plot,aes(x=decision))+
  geom_boxplot(aes(y=pov9))+
  geom_jitter(aes(y=pov9))+
  theme_bw()+
  labs(y="Gini index",x=NULL,size=0.1)+
  theme(title=element_text(size=8),legend.position="none")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())

summary(da_train$pov9)
summary(da_1)
summary(da$pov9)
sd(da_1,na.rm = T)
sd(da$pov9,na.rm = T)

##rolling method----
#da_rolling
name<-names(da_rolling)
a<-c(which(name=="year"),which(name=="id"))
da_new<-da_rolling[,-a]
num_test<-which(is.na(da_new$pov9))
num_train<-(1:3775)[-num_test]
for (i in 1:26)
{
  n<-sample(num_test,100)
  da_test<-da_new[n,]
  da_train<-da_new[num_train,]
  result<-randomForest(pov9~.,data=da_train,importance=TRUE, na.action=na.omit,ntree=1000)
  da_1<-predict(result,da_test)
  da_new$pov9[n]<-da_1
  da_rolling$pov9[n]<-da_1
  num_test<-num_test[-match(n,num_test)]
  num_train<-c(num_train,n)
}
da_test<-da_new[num_test,]
da_train<-da_new[num_train,]

write.dta(da_rolling,file="da_rolling.dta")

##compare the gini index ----
da_r<-data.frame(da_rolling,decision=1)
da1_r<-da_rolling
da1_r$decision<-"Total data"
da_r$decision[num_test_original]<-"Test data"
da_r$decision[num_train_original]<-"train data"
da_plot_r<-rbind(da_r,da1_r)

summary(da_rolling$pov9[num_train_original])
summary(da_rolling$pov9[num_test_original])
summary(da_rolling$pov9)
sd(da_rolling$pov9[num_test_original],na.rm = T)
sd(da_rolling$pov9,na.rm = T)

##fig A2
da_graph<-da_plot_r
da_graph<-da_graph[-which(is.na(da_graph$pov9)==1),]
da_graph$decision<-factor(da_graph$decision,levels=c("Training data","Testing data","*Testing data*","Total data","*Total data*"))
ggplot(da_graph,aes(x=decision))+
  geom_boxplot(aes(y=pov9))+
  geom_jitter(aes(y=pov9),width = 0.4,alpha=0.3,stroke=0.1,size=0.9)+
  theme_bw()+
  labs(y="Gini index",x=NULL,size=0.1)+
  theme(title=element_text(size=8),legend.position="none")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())


##machine learning----
da<-da_rolling
name<-colnames(da)
a<-c(which(name=="year"),which(name=="id"))
da<-da[,-a]
result<-randomForest(pov9~.,data=da,importance=TRUE, na.action=na.omit,ntree=10)
im<-importance(result)
write.csv(im,file="im.csv")

##fig 2-----
da<-read.dta13("data_mining.dta")
a<- floor((da$year-1993)/13)
a<-as.factor(a)
da<-data.frame(da,year1=a)
ggplot(da,aes(x=vdem_egaldem,y=pov9,colour=factor(year1),fill=factor(year1)))+
  geom_point()+
  geom_smooth(method="lm",formula = y ~log(x),size=1)+
  theme_bw()+
  ylim(30,50)+
  scale_color_manual(name="Year",values=c("gray40", "black"),labels=c('1993-2005','2006-2017'))+
  scale_fill_manual(name="Year",values=c("gray40", "black"),labels=c('1993-2005','2006-2017'))+
  theme(title=element_text(size=10))+
  labs(y="Gini index",x="Level of democracy")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5),axis.line = element_line())

##data processing----
da_1<-read.dta13("da_original.dta")
da<-read.dta13("data_mining.dta")
da_2<-read.dta13("da_rolling.dta")
country<-read.csv("country.csv",header=T)
da_1$id<-factor(da_1$id,levels=country[,2])
names(table(da_1$id))
country<-labels(da2)
da_1$dpi_system[which(da_1$dpi_system==1)]<-0
da_1$dpi_system[which(da_1$dpi_system==2)]<-1
da_1$dpi_system[which(da_1$dpi_system==3)]<-1
da_1$ht_colonial[which(da_1$ht_colonial==1)]<-0
da_1$ht_colonial[which(da_1$ht_colonial>1)]<-1
save.dta13(da_1,file="da_original.dta")
##as for rolling 
country<-read.csv("country.csv",header=T)
da_2$id<-factor(da_2$id,levels=country[,2])
names(table(da_2$id))
country<-labels(da2)
da_2$dpi_system[which(da_2$dpi_system==1)]<-0
da_2$dpi_system[which(da_2$dpi_system==2)]<-1
da_2$dpi_system[which(da_2$dpi_system==3)]<-1
da_2$ht_colonial[which(da_2$ht_colonial==1)]<-0
da_2$ht_colonial[which(da_2$ht_colonial>1)]<-1
save.dta13(da_2,file="da_rolling.dta")


da<-read.dta13("da_rolling.dta")
mean(da$pov9[which(da$vdem_egaldem<0.07)],na.rm = T)
mean(da$pov9[which(da$vdem_egaldem<0.495&da$vdem_egaldem>0.493)],na.rm = T)

mean(da$pov9[which(da$vdem_egaldem<0.8&da$vdem_egaldem>0.7)],na.rm = T)



##国家统计表格-----
da<-read.dta13("da_rolling.dta")

country<-names(table(da$id))
n<-length(country)
democracy<-vector(length=n)
for (i in 1:n)
{
  da_1<-da[which(da$id==country[i]),]
  democracy[i]<-mean(da_1$vdem_egaldem,na.rm = T)
}
a<-mean(da$vdem_egaldem,na.rm = T)
democracy[4]<-0
democracy[16]<-0
sum(democracy>a)

da_s<-da[which(da$dpi_system==0),]
sum(table(da_s$id)>12)

b<-mean(da$pov9,na.rm = T)
da_d<-da[which(da$pov9>b),]
sum(table(da_d$id)>=8)

n<-length(country)
gini<-vector(length=n)
for (i in 1:n)
{
  da_1<-da[which(da$id==country[i]),]
  gini[i]<-mean(da_1$pov9,na.rm = T)
}
a<-mean(da$pov9,na.rm = T)
gini[1]<-0
gini[16]<-0
gini[4]<-0
gini[36]<-0
gini[44]<-0
gini[127]<-0

sum(gini>a)



da$id<-as.character(da$id)
write.dta(da,file="da_rolling_f.dta")
religion<-read.dta13("religion.dta")
religion<-religion[which(religion$year==2015),]


##提取两年的gini系数----
da<-read.csv("gini.csv",header = T)[,c(1,2)]
colnames(da)<-c("id","value")
da1<-read.dta13("da_rolling.dta")
da_gini<-da1[,c(1,2,3)]
pov2005<-da1[which(da1$year==2000),c(1,3)]
pov2015<-da1[which(da1$year==2010),c(1,3)]
da<-merge(da,pov2005,by="id")
da<-merge(da,pov2015,by="id")
da<-merge(da,wide,by="id")
colnames(da)<-c("id","value","gini2000","gini2010","ginimean")
write.csv(da,file="gini模板.csv")


wide <- reshape(da_gini, v.names = "pov9", idvar = "id",
                      timevar = "year", direction = "wide")
wide1<-wide[,-1]
wide$ginimean<-rowMeans(wide1,na.rm = T)
rowSums(wide1,na.rm = T)
wide<-wide[,c(1,27)]


###添加国家性质----
da1<-read.csv("gini模板_original.csv",header = T)
da2<-read.csv("gini模板_rolling.csv",header = T)
da3<-read.dta13("dpi_system.dta")[,2:3]
da4<-read.dta13("colonial.dta")[,2:3]
colnames(da3)<-c("id","dpi_system")
da1<-merge(da1,da3,by="id")
da2<-merge(da2,da3,by="id")
da1<-merge(da1,da4,by="id")
da2<-merge(da2,da4,by="id")
write.csv(da1,file="gini模板_original.csv")
write.csv(da2,file="gini模板_rolling.csv")




